* Session setup for the Habitation 2003 fuzzy merge.
clear all
* Declare the interpreter version before any other setting so that everything
* below is run under version-14 rules.
version 14
set more off, perm
set matsize 10000
* NOTE: the former "set mem 10000000" line was removed. Memory is managed
* automatically from Stata 12 onwards and the setting is ignored, and this
* file cannot run on anything older than Stata 14 because of the version line.

****************************************************************** 
*** Habitation 2003 Fuzzy Merge (remaining unmatched villages) ***
****************************************************************** 

* All relative file names used below are resolved against $hmerge, so stop
* immediately with a clear message if that global has not been defined.
if `"$hmerge"' == "" {
	display as error "global hmerge is not defined; set it to the merge working directory before running this do-file"
	exit 198
}
cd "$hmerge"

****************************************************************** 
*** Helper program shared by Steps 1-3 ***************************
****************************************************************** 
* hab03_fuzzy_step KEY BLOCKVAR OUTFILE
*
*   KEY       suffix used in the names of the generated variables
*             (fscore_KEY_*, fmerge_KEY_*, fuzzy3_KEY, fscore3_KEY, ...)
*   BLOCKVAR  block-code variable that, together with stdt, is passed to
*             reclink as a required (exact) match variable
*   OUTFILE   .dta file that receives the rows left with a non-missing h3v_id
*
* Reads  "temp03_clean.dta" and "hab_census03_for_merge_temp_unmatched_temp.dta"
*        (both written by the preparation code in Step 1).
* Writes "temp03.dta" (scratch, overwritten on every call) and OUTFILE.
*
* The three steps used to be three copies of the same ~200 lines that
* differed only in KEY, BLOCKVAR and OUTFILE.
* The program is defined outside the { } folding blocks on purpose.
capture program drop hab03_fuzzy_step
program define hab03_fuzzy_step
	version 14
	args key blockvar outfile

	*------------------------------------------------------------------
	* Reclink every village-name variant against the hab census names
	*------------------------------------------------------------------
	timer clear
	timer on 1
	use "temp03_clean.dta", clear
	save "temp03.dta", replace

	foreach vi of varlist village_pca village_vd village11 village_conc01 village_conc11 village_rggvy village_lh {
		* start every pass from the name data without the columns added by earlier passes
		cap drop fscore* fmerge* h3v_id* totp_h3* Uvillage*
		duplicates drop
		* village_pca -> pca, village11 -> 11, village_conc01 -> conc01, ...
		local vi_stub = subinstr(substr("`vi'",8,99),"_","",.)
		rename `vi' village
		keep if village!="--DUPLICATE--" & village!=""
		reclink stdt `blockvar' village using "hab_census03_for_merge_temp_unmatched_temp.dta", idmaster(names_id) ///
			idusing(h3v_id) gen(fscore_`key'_`vi_stub') required(stdt `blockvar') _merge(fmerge_`key'_`vi_stub') minscore(0.9)
		keep if fmerge_`key'_`vi_stub'==3
		gen h3v_id_`key'_`vi_stub' = h3v_id
		gen totp_h3_`key'_`vi_stub' = totp_h3
		rename Uvillage Uvillage_`key'_`vi_stub'
		keep names_id Uvillage_`key'_`vi_stub' fscore_`key'_`vi_stub' fmerge_`key'_`vi_stub' h3v_id_`key'_`vi_stub' totp_h3_`key'_`vi_stub'
		duplicates drop
		* NOTE(review): merge m:m pairs observations within names_id by position,
		* not by content. If names_id can repeat on both sides this may attach a
		* match to the wrong row - confirm, and consider joinby. Left unchanged
		* here so that the set of matches is not altered.
		merge m:m names_id using "temp03.dta", nogen 
		save "temp03.dta", replace
	}
	timer off 1
	timer list

	*------------------------------------------------------------------
	* Define a single h3v_id for each row: the first name variant (in the
	* order listed below) whose score equals the row maximum wins
	*------------------------------------------------------------------
	use "temp03.dta", clear
	rename village11 village_11
	duplicates drop
	egen fscore_rowmax = rowmax(fscore_`key'_*)
	egen fmerge_rowmax = rowmax(fmerge_`key'_*)
	gen h3v_id = .
	gen fscore_`key' = .
	gen fuzzy_`key' = ""
	gen Uvillage = ""
	gen Mvillage = ""
	foreach vi in _pca _vd _11 _conc01 _conc11 _rggvy _lh {
		* this variant matched and carries the best score of the row
		local best fmerge_`key'`vi'==3 & fscore_`key'`vi'==fscore_rowmax
		replace h3v_id = h3v_id_`key'`vi' if `best' & h3v_id==.
		replace fscore_`key' = fscore_`key'`vi' if `best' & fscore_`key'==.
		replace fuzzy_`key' = substr("`vi'",2,10) if `best' & fuzzy_`key'==""
		replace Uvillage = Uvillage_`key'`vi' if `best' & Uvillage==""
		replace Mvillage = village`vi' if `best' & Mvillage==""
	}
	rename village_11 village11
	assert h3v_id!=. if fmerge_rowmax==3
	drop fmerge_* fscore_`key'_* fscore_rowmax h3v_id_* totp_h3* Uvillage_`key'*
	duplicates drop
	merge m:1 h3v_id using "hab_census03_for_merge_temp_unmatched_temp.dta", keep(1 3) keepusing(block village totp_h3 maxp_h3 count_h3)
	* the name returned by reclink must be the name stored under that h3v_id
	assert Uvillage==village
	drop Uvillage
	duplicates drop

	*------------------------------------------------------------------
	* Remove duplicates, so a single PCA village doesn't match to
	* multiple hab census villages
	*------------------------------------------------------------------
	* dup1 is tagged once here and is not refreshed by the drops below
	duplicates t names_id, gen(dup1)
	sort names_id

	  // drop the few where there are just straight duplicates of both master and using identifiers
	duplicates drop names_id h3v_id, force

	  // drop by population: a row goes when the adjacent row with the same
	  // names_id has an exact population match and the row itself does not.
	  // Three passes, each looking at the previous and then the next row.
	gen pop_diff01 = abs(tot_p - totp_h3)
	gen pop_diff11 = abs(tot_p11 - totp_h3)
	forvalues pass = 1/3 {
		foreach nb in -1 +1 {
			drop if dup1>0 & names_id==names_id[_n`nb'] & ((pop_diff01!=. & pop_diff01>0 & pop_diff01[_n`nb']==0) | (pop_diff11!=. & pop_diff11>0 & pop_diff11[_n`nb']==0))
		}
	}

	  // clean names to find identical ones among dups
	foreach v of varlist Mvillage village {
		replace `v' = subinstr(`v',"{"," ",.)
		replace `v' = subinstr(`v',"}"," ",.)
		replace `v' = subinstr(`v',"["," ",.)
		replace `v' = subinstr(`v',"]"," ",.)
		replace `v' = subinstr(`v',"."," ",.)
		replace `v' = subinstr(`v',"-"," ",.)
		replace `v' = subinstr(`v',"'"," ",.)
		replace `v' = trim(itrim(`v'))  
	}

	forvalues iter = 1/3 {
		  // neighbour's two names agree once blanks are removed, this row's do not
		foreach nb in -1 +1 {
			drop if dup1>0 & names_id==names_id[_n`nb'] & subinstr(Mvillage[_n`nb']," ","",.)==subinstr(village[_n`nb']," ","",.) & subinstr(Mvillage," ","",.)!=subinstr(village," ","",.)
		}
		  // neighbour's two names share the digit 1-4, this row's hab name lacks it
		forvalues x = 1/4 {
			foreach nb in -1 +1 {
				drop if dup1>0 & names_id==names_id[_n`nb'] & (regexm(Mvillage[_n`nb'],"`x'")==1 & regexm(village[_n`nb'],"`x'")==1) & regexm(village,"`x'")!=1
			}
		}
		  // same test for roman numerals, first preceded and then followed by a blank
		foreach x in III II I {
			foreach nb in -1 +1 {
				drop if dup1>0 & names_id==names_id[_n`nb'] & (regexm(Mvillage[_n`nb']," `x'")==1 & regexm(village[_n`nb']," `x'")==1) & regexm(village," `x'")!=1
			}
			foreach nb in -1 +1 {
				drop if dup1>0 & names_id==names_id[_n`nb'] & (regexm(Mvillage[_n`nb'],"`x' ")==1 & regexm(village[_n`nb'],"`x' ")==1) & regexm(village,"`x' ")!=1
			}
		}
		  // same test for a final letter A or B (blanks ignored)
		foreach x in A B {
			foreach nb in -1 +1 {
				drop if dup1>0 & names_id==names_id[_n`nb'] & (substr(subinstr(Mvillage[_n`nb']," ","",.),-1,1)=="`x'" & substr(subinstr(village[_n`nb']," ","",.),-1,1)=="`x'") &  substr(subinstr(village," ","",.),-1,1)!="`x'"
			}
		}
	}

	  // drop by fscore (disabled)
	*drop if dup1>0 & names_id==names_id[_n-1] & fscore_`key'<fscore_`key'[_n-1] & fscore_`key'!=. & fscore_`key'[_n-1]!=.
	*drop if dup1>0 & names_id==names_id[_n+1] & fscore_`key'<fscore_`key'[_n+1] & fscore_`key'!=. & fscore_`key'[_n+1]!=.

	  // for remaining dups, treat them like quarterbacks: if you have 2, that means you really have 0  (and reset all hab variables)
	duplicates t names_id, gen(dup1a)
	foreach v of varlist h3v_id totp_h3 maxp_h3 count_h3 fscore_`key' pop_diff01 pop_diff11 {
		replace `v' = . if _merge==3 & dup1a>0
	}
	foreach v of varlist block village fuzzy_`key' Mvillage {
		replace `v' = "" if _merge==3 & dup1a>0
	}
	duplicates drop
	duplicates r names_id

	*------------------------------------------------------------------
	* Remove duplicates, so a single hab census village doesn't match to
	* multiple PCA villages
	*------------------------------------------------------------------
	duplicates t h3v_id, gen(dup2)
	replace dup2 = . if h3v_id==.
	sort h3v_id
	* row belongs to a group of several rows sharing one non-missing h3v_id
	local isdup dup2>0 & dup2!=.

		// pick the match with the closest 2001 population, within reason.
		// FIX: pop_diff01!=. is required explicitly. Without it a group in which
		// every tot_p is missing satisfied both "min==own" (.==.) and
		// "min<=0.2*tot_p" (.<=.), so rows with no 2001 population were kept as
		// the closest match and the later fallbacks were never tried.
	egen min_pop_diff01 = min(pop_diff01), by(h3v_id)
	egen min_pop_diff11 = min(pop_diff11), by(h3v_id)
	gen dup2_keep = 0
	replace dup2_keep = 1 if `isdup' & pop_diff01!=. & min_pop_diff01==pop_diff01 & (min_pop_diff01<=0.2*tot_p | min_pop_diff01<20)
	egen dup2_keep_max1 = max(dup2_keep), by(h3v_id)

		// pick the match with the closest 2011 population, within reason
		// (same explicit missing-value guard as for 2001)
	*br if dup2>0 & dup2<. & dup2_keep_max1==0
	replace dup2_keep = 1 if `isdup' & dup2_keep_max1==0 & pop_diff11!=. & min_pop_diff11==pop_diff11 & (min_pop_diff11<=0.2*tot_p11 | min_pop_diff11<20)
	egen dup2_keep_max2 = max(dup2_keep), by(h3v_id)

		// find dups where the total PCA population adds up to hab population (identifies BOTH
		// villages with identical names and non-unique PCA villages in the names master dataset)
	*br if dup2>0 & dup2<. & dup2_keep_max2==0
	egen sum_pop01 = sum(tot_p), by(h3v_id)
	egen sum_pop11 = sum(tot_p11), by(h3v_id)
	replace dup2_keep = 1 if `isdup' & dup2_keep_max2==0 & (abs(totp_h3 - sum_pop01)<20 | abs(totp_h3 - sum_pop01)<0.2*sum_pop01) 
	egen dup2_keep_max3 = max(dup2_keep), by(h3v_id)
	replace dup2_keep = 1 if `isdup' & dup2_keep_max3==0 & (abs(totp_h3 - sum_pop11)<20 | abs(totp_h3 - sum_pop11)<0.2*sum_pop11)
	egen dup2_keep_max4 = max(dup2_keep), by(h3v_id)

		// pick the match with the highest fscore
	*br if dup2>0 & dup2<. & dup2_keep_max4==0
	egen max_fscore = max(fscore_`key'), by(h3v_id)
	replace dup2_keep = 1 if `isdup' & dup2_keep_max4==0 & fscore_`key'==max_fscore & fscore_`key'!=. 
	egen dup2_keep_max5 = max(dup2_keep), by(h3v_id)

		// reset h3v_id and hab census variables to blanks for the weaker of the duplicate matches
	foreach v of varlist h3v_id totp_h3 maxp_h3 count_h3 fscore_`key' pop_diff01 pop_diff11 sum_pop01 sum_pop11 {
		replace `v' = . if `isdup' & dup2_keep==0
	}
	foreach v of varlist block village fuzzy_`key' {
		replace `v' = "" if `isdup' & dup2_keep==0
	}

	drop dup1* _merge dup2* Mvillage min_pop* max_fscore

	rename block block3
	rename village village3
	rename pop_diff01 pop_diff01_3
	rename pop_diff11 pop_diff11_3
	rename sum_pop01 sum_pop_01_3
	rename sum_pop11 sum_pop_11_3
	rename fuzzy_`key' fuzzy3_`key'
	rename fscore_`key' fscore3_`key'

	*------------------------------------------------------------------
	* Clean up questionable fuzzy matches: score below 0.97 AND both the
	* 2001 and the 2011 population differ by more than 20 and more than 20%
	*------------------------------------------------------------------
	gen bad_match = (h3v_id!=. & fscore3_`key'<0.97 & pop_diff01_3>20 & pop_diff01_3>0.2*tot_p & pop_diff01_3!=. & pop_diff11_3>20 & pop_diff11_3>0.2*tot_p11 & pop_diff11_3!=.)
	foreach v of varlist h3v_id totp_h3 maxp_h3 count_h3 fscore3_`key' pop_diff01_3 pop_diff11_3 sum_pop_01_3 sum_pop_11_3 {
		replace `v' = . if bad_match==1
	}
	foreach v of varlist block3 village3 fuzzy3_`key' {
		replace `v' = "" if bad_match==1
	}
	drop bad_match

	*Drop duplicates and bad matches, confirm a(n almost) 1-1 match
	duplicates drop
	duplicates r h3v_id
	duplicates r names_id

	*Keep only the rows that ended up with a hab census match
	keep if h3v_id!=.
	save "`outfile'", replace
end

****************************************************************** 
*** Step 1:  Hab 2003 unmatched, bk_code_pca *********************
****************************************************************** 
{
*Prepare the PCA names file: string codes, state+district key, reclink-safe brackets
use "pca_2001_names_merges_all_temp_unmatched3.dta", clear
tostring  st_code dt_code bk_code*, replace
cap drop stdt
gen stdt = st_code + " " + dt_code
foreach v of varlist village* {
	replace `v' = subinstr(`v',"(","{",.)
	replace `v' = subinstr(`v',")","}",.)
}

*Flag a name variant as a duplicate when it repeats any variant listed before it.
*The replaces run in sequence, so later variants are compared with the already
*flagged values of earlier ones, exactly as in the written-out version.
local earlier village_pca
foreach v in village_vd village11 village_conc01 village_conc11 village_rggvy village_lh {
	local same 0
	foreach e of local earlier {
		local same `same' | `v'==`e'
	}
	replace `v' = "--DUPLICATE--" if (`same') & `v'!=""
	local earlier `earlier' `v'
}
save "temp03_clean.dta", replace

*Prepare the hab census file the same way
use "$hab3/hab_census03_for_merge_temp_unmatched.dta", clear
tostring  st_code dt_code bk_code*, replace
cap drop stdt
gen stdt = st_code + " " + dt_code
replace village = subinstr(village,"(","{",.)
replace village = subinstr(village,")","}",.)
save "hab_census03_for_merge_temp_unmatched_temp.dta", replace

hab03_fuzzy_step pca bk_code_pca "temp03_matched1.dta"
}

****************************************************************** 
*** Step 2:  Hab 2003 unmatched, bk_code11 ***********************
****************************************************************** 
{
hab03_fuzzy_step 11 bk_code11 "temp03_matched2.dta"
}

****************************************************************** 
*** Step 3:  Hab 2003 unmatched, bk_code_rggvy *******************
****************************************************************** 
{
hab03_fuzzy_step rggvy bk_code_rggvy "temp03_matched3.dta"
}

****************************************************************** 
*** Step 4:  Append 3 reclink matched datasets for export ********
****************************************************************** 
{
* Stack the matched rows saved by the three reclink passes (one file per
* block-code variable) and write the combined file used for export.
use "temp03_matched1.dta", clear
append using "temp03_matched2.dta" "temp03_matched3.dta"
save "pca_2001_names_matched_reclink_03.dta", replace
}

****************************************************************** 
*** Step 5:  Masala Merge on remaining Hab 2003 unmatched ********
****************************************************************** 
{ 
do "$path_code/merge/masala_merge_lp_server.do"
set more off, perm

  // NOTE: The masala_merge_lp_server.do program is buggy, but it works wonders. It is modified from the original version
  // written by Sam Novosad, which he generously shared with us. It is crucial that the 2003 and 2009 masala_merge steps not
  // be run simultaneously on multiple instances of Stata, or else Python will overwrite one with the other. Be sure to "clear all"
  // before running masala merge (i.e. before the "do" above, since "clear all" also drops loaded programs), to start from a clean
  // session. NOTE(review): "clear all" does not drop local/global macros; use "macro drop _all" if those must be reset too.
  // This whole step takes AT LEAST 48 hours to run completely, so be patient.
  
*Prep unmatched names for masala merge
use "pca_2001_names_merges_all_temp_unmatched3.dta", clear
tostring st_code dt_code bk_code*, replace
* Rebuild the state+district key from the string codes
capture drop stdt
generate stdt = st_code + " " + dt_code

* Swap round brackets for curly ones in every village-name variant
foreach name_var of varlist village* {
	replace `name_var' = subinstr(subinstr(`name_var',"(","{",.),")","}",.)
}

* Walk the variants in a fixed order; a non-empty name that equals any EARLIER variant
* (as it stands after that variant was itself processed) is overwritten with "--DUPLICATE--".
* The order matters because each comparison sees the already-updated earlier variables.
local earlier village_pca
foreach name_var in village_vd village11 village_conc01 village_conc11 village_rggvy village_lh {
	local earlier_csv : subinstr local earlier " " ", ", all
	replace `name_var' = "--DUPLICATE--" if inlist(`name_var', `earlier_csv') & `name_var'!=""
	local earlier `earlier' `name_var'
}

* Same data written under both file names
save "temp03_clean.dta", replace
save "temp03.dta", replace

* Hab 2003 side: build the same stdt key and apply the same bracket substitution to village
use "$hab3/hab_census03_for_merge_temp_unmatched.dta", clear
tostring st_code dt_code bk_code*, replace
capture drop stdt
generate stdt = st_code + " " + dt_code
replace village = subinstr(subinstr(village,"(","{",.),")","}",.)
save "hab_census03_for_merge_temp_unmatched_temp.dta", replace

*Masala merges
* One masala_merge2 pass per (block key, village-name variant) pair, in this order:
* bk_code_pca, bk_code11, bk_code_rggvy, then no block key at all (stdt only).
* Each pass reloads temp03.dta, merges on the chosen variant, keeps _masala_merge==3,
* and saves MMout03_<bk>_<variant stub>.dta.
local name_variants village_pca village_vd village11 village_conc01 village_conc11 village_rggvy village_lh
foreach bk in pca 11 rggvy nobk {
	* Extra merge key (besides stdt) for this pass, and the label shown in the progress message
	if "`bk'"=="nobk" {
		local bk_key
		local bk_label "no bk_code"
	}
	else {
		local bk_key = cond("`bk'"=="11", "bk_code11", "bk_code_`bk'")
		local bk_label `bk_key'
	}
	foreach vi of local name_variants {
		* village_lh is not run against bk_code_rggvy
		if "`bk'"=="rggvy" & "`vi'"=="village_lh" {
			continue
		}
		quietly use "temp03.dta", clear
		* Stub = variant name without the "village" prefix and without underscores (e.g. village_conc01 -> conc01)
		local vi_stub : subinstr local vi "village" ""
		local vi_stub : subinstr local vi_stub "_" "", all
		quietly rename `vi' village
		* Only names that are non-empty and not flagged as repeats of an earlier variant go into the merge
		quietly drop if village=="--DUPLICATE--" | village==""
		masala_merge2 stdt `bk_key' using "hab_census03_for_merge_temp_unmatched_temp.dta", s1(village) outfile(out03_`bk'_`vi_stub') dist(5) quietly
		di "Masala Merge `bk_label' `vi_stub' complete!"
		keep if _masala_merge==3
		rename village `vi'
		rename village_using village3
		rename block block3
		* Record which block key and which name variant produced these rows
		gen bk = "`bk'"
		gen vi = "`vi'"
		drop merge*
		save "MMout03_`bk'_`vi_stub'.dta", replace
	}
}

* Combine every per-pass output MMout03_<bk>_<vi>.dta into MMout03_all.dta, erasing each
* part once it has been folded in. Start from scratch: remove any earlier combined file.
cap erase "MMout03_all.dta"
foreach bk in pca 11 rggvy nobk {
  foreach vi in pca vd 11 conc01 conc11 rggvy lh {
    local part "MMout03_`bk'_`vi'.dta"
    * Some combinations are never produced (rggvy x lh is skipped above); ignore only THAT case.
    capture confirm file "`part'"
    if _rc {
      continue
    }
    use "`part'", clear
    * The combined file does not exist yet on the first part; otherwise append it below the new rows.
    capture confirm file "MMout03_all.dta"
    if !_rc {
      append using "MMout03_all.dta"
    }
    * No -capture- on use/append/save: previously a failed append (e.g. a string/numeric type clash)
    * was swallowed, the save then overwrote the accumulated file with this one part, and the part
    * was erased. Now any such error stops the run with all remaining files intact.
    save "MMout03_all.dta", replace
    erase "`part'"
  }
}
}

****************************************************************** 
******************************************************************

